Why does a customer leave a telecom company? Most people leave any service because of dissatisfaction with the way they are treated. They would not be looking around if they were happy with their current provider, its service and employees. source
Accenture reports that 77% of consumers are no longer as loyal to brands as they were even three years ago. Much like everyone else, the telecom industry must work harder than ever at customer retention. source
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# Load the customer master table; literal 'N.A' strings are parsed as NaN.
customer_info = pd.read_csv('Customer_info.csv',na_values = 'N.A')
# NOTE: 'subcriber' (sic) is the spelling used consistently throughout this notebook.
customer_info.rename(columns = {'type':'subcriber_type'},inplace = True)
# Dimensions
print("Number of Rows: {} \nNumber of Columns: {}".format(customer_info.shape[0],customer_info.shape[1]))
customer_info.head()
# Number of distinct customers.
customer_info.custid.nunique()
# Load the postpaid usage records (one row per billing event).
Postpaid_usage = pd.read_csv('Postpaid_usage.csv',na_values = 'N.A')
Postpaid_usage.rename(columns = {'date':'billing_date'},inplace = True)
# Dimensions
print("Number of Rows: {} \nNumber of Columns: {}".format(Postpaid_usage.shape[0],Postpaid_usage.shape[1]))
Postpaid_usage.head()
Postpaid_usage.custid.nunique()
# Collect each customer's distinct billing dates (set, then list, per custid).
postpaid_billdate = Postpaid_usage.groupby('custid')['billing_date'].apply(set).reset_index()
postpaid_billdate['billing_date'] = postpaid_billdate['billing_date'].apply(list)
# Most recent billing date per customer: take the maximum of each date list,
# comparing by parsed datetime rather than lexicographically.
recent_billdate = [
    max(billdates, key=lambda d: datetime.strptime(d, '%Y-%m-%d'))
    for billdates in postpaid_billdate['billing_date']
]
postpaid_billdate['last_billing_date'] = recent_billdate
# Distinct status values per customer.
postpaid_status = Postpaid_usage.groupby('custid')['status'].apply(set).reset_index()
# Strip the set repr's braces and quotes, e.g. "{'active'}" -> "active".
# NOTE(review): this slicing hack is only correct when a customer has exactly
# one status value -- confirm with the value_counts below.
postpaid_status['status'] = postpaid_status['status'].apply(lambda x: str(x)[2:-2])
postpaid_status.head()
postpaid_status.status.value_counts()
postpaid_status.custid.nunique()
# Total billed amount per customer.
# Use the string alias 'sum': pandas deprecates passing the builtin ``sum``
# callable to GroupBy.agg and the alias dispatches to the fast groupby sum.
postpaid_amount = Postpaid_usage.groupby('custid')['amount'].agg('sum').reset_index()
postpaid_amount.head()
postpaid_amount.custid.nunique()
# Distinct plans per customer, stringified so identical plan sets group together.
postpaid_plan = Postpaid_usage.groupby('custid')['plan'].apply(set).reset_index()
postpaid_plan['plan'] = postpaid_plan['plan'].apply(str)
postpaid_plan.head()
postpaid_plan.plan.value_counts()
postpaid_plan.custid.nunique()
# Load per-customer journey events (date, type, description, resolution time).
post_journey = pd.read_csv('post_journey.csv',na_values = 'N.A')
post_journey.rename(columns = {'date':'event_date','type':'event_type'},inplace = True)
# Dimensions
print("Number of Rows: {} \nNumber of Columns: {}".format(post_journey.shape[0],post_journey.shape[1]))
post_journey.head()
post_journey.custid.nunique()
# Distinct event dates per customer (set, then list, per custid).
post_journey_event_date = post_journey.groupby('custid')['event_date'].apply(set).reset_index()
post_journey_event_date['event_date'] = post_journey_event_date['event_date'].apply(list)
post_journey_event_date.head()
# Latest event date per customer: maximum by parsed datetime.
last_eventdate = [
    max(event_dates, key=lambda d: datetime.strptime(d, '%Y-%m-%d'))
    for event_dates in post_journey_event_date['event_date']
]
post_journey_event_date['last_event_date'] = last_eventdate
post_journey_event_date.head()
# Distinct event types and descriptions per customer.
post_journey_event_type = post_journey.groupby('custid')['event_type'].apply(set).reset_index()
post_journey_event_type['event_type'] = post_journey_event_type['event_type'].apply(list)
post_journey_event_type.head()
post_journey_desc = post_journey.groupby('custid')['description'].apply(set).reset_index()
post_journey_desc['description'] = post_journey_desc['description'].apply(list)
post_journey_desc.head()
# Total days-to-resolve per customer. Use the string alias 'sum': pandas
# deprecates passing the builtin ``sum`` callable to GroupBy.agg.
postpaid_daystoresolve = post_journey.groupby('custid')['daystoresolve'].agg('sum').reset_index()
postpaid_daystoresolve.head()
# compile the list of dataframes you want to merge
data_frames = [customer_info, postpaid_billdate, postpaid_status, postpaid_amount,postpaid_plan,
               post_journey_event_date,post_journey_event_type,post_journey_desc,postpaid_daystoresolve]
from functools import reduce
# Outer-join every per-customer frame on custid so no customer is dropped,
# even if it is missing from some of the source tables.
df_merged = reduce(lambda left,right: pd.merge(left,right,on=['custid'],
                                               how='outer'), data_frames).fillna(np.nan)
df_merged.custid.nunique()
# Dimensions
print("Number of Rows: {} \nNumber of Columns: {}".format(df_merged.shape[0],df_merged.shape[1]))
df_merged.head()
def days_between(d1, d2):
    """Return the absolute number of days between two 'YYYY-MM-DD' date strings."""
    fmt = "%Y-%m-%d"
    delta = datetime.strptime(d2, fmt) - datetime.strptime(d1, fmt)
    return abs(delta.days)
# Span (in days) between a customer's first and last journey event.
days_btw_events = []
for ind,date in enumerate(df_merged['event_date']):
    try:
        dates = sorted(date, key=lambda x: datetime.strptime(x,'%Y-%m-%d'))
        days_btw_events.append(days_between(dates[0],dates[-1]))
    except (TypeError, ValueError):
        # Customers with no journey rows have NaN here (a float, not iterable)
        # -> TypeError; malformed date strings -> ValueError. Record a zero gap
        # for both instead of swallowing every exception with a bare ``except``.
        days_btw_events.append(0)
df_merged['event_date_gap'] = days_btw_events
df_merged.head()
df_merged.describe()
df_merged.isnull().sum()
For 208 customers the status is missing, so we drop those 208 rows.
# Drop rows without a target label and re-index.
df_merged.dropna(subset = ['status'],inplace = True)
df_merged = df_merged.reset_index(drop = True)
# Assign instead of Series.fillna(..., inplace=True): the inplace form operates
# on an intermediate object and is deprecated under pandas copy-on-write.
df_merged['event_type'] = df_merged['event_type'].fillna('None')
df_merged['description'] = df_merged['description'].fillna('None')
df_merged['daystoresolve'] = df_merged['daystoresolve'].fillna(0)
df_merged.isnull().sum()
df_merged.shape
# credits: The function was taken from https://www.kaggle.com/artgor/brute-force-feature-engineering
def reduce_memory(df, verbose=True):
    """Downcast each numeric column to the smallest dtype whose range holds it.

    Integer columns try int8 -> int16 -> int32 -> int64; float columns try
    float16 -> float32 and otherwise fall back to float64. Non-numeric
    columns are left untouched. Returns the (mutated) dataframe.
    """
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = [np.int8, np.int16, np.int32, np.int64]
    float_candidates = [np.float16, np.float32]
    start_mem = df.memory_usage().sum() / 1024**2
    for name in df.columns:
        dtype = df[name].dtypes
        if dtype not in numeric_dtypes:
            continue
        lo = df[name].min()
        hi = df[name].max()
        if str(dtype)[:3] == 'int':
            for cand in int_candidates:
                bounds = np.iinfo(cand)
                if bounds.min < lo and hi < bounds.max:
                    df[name] = df[name].astype(cand)
                    break
        else:
            for cand in float_candidates:
                bounds = np.finfo(cand)
                if bounds.min < lo and hi < bounds.max:
                    df[name] = df[name].astype(cand)
                    break
            else:
                # Range (or NaN min/max) did not fit any smaller float type.
                df[name] = df[name].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Memory usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
# Before Memory reduction
print("Total memory used before Memory reduction {:5.2f}Mb".format(df_merged.memory_usage().sum() / 1024**2))
# After Memory reduction
# Note: reduce_memory mutates df_merged in place and returns it.
df_merged = reduce_memory(df_merged)
print("Total memory used after Memory reduction {:5.2f}Mb".format(df_merged.memory_usage().sum() / 1024**2))
For this, let's consider the Postpaid_usage data.
# Index usage records by billing date for time-based resampling.
Postpaid_usage['billing_date'] = pd.to_datetime(Postpaid_usage['billing_date'])
Postpaid_usage.set_index('billing_date',inplace = True)
# Average monthly usage (average monthly amount)
# NOTE(review): the 'M' (month-end) alias is deprecated in pandas >= 2.2 in
# favour of 'ME' -- confirm against the installed pandas version.
Postpaid_usage['amount'].resample('M').agg('mean')
import plotly.graph_objects as go
# Donut chart of churned ('inactive') vs retained ('active') customers.
labels = ['Exited','Continued']
values = [df_merged.status[df_merged['status']=='inactive'].count(), df_merged.status[df_merged['status']=='active'].count()]
colors = ['red', 'darkorange']
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.update_traces(hole=.4, hoverinfo='label+value', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.update_layout(
    title_text="Ratio of Customer Churned and Retained")
fig.show()
df_merged.status.value_counts()
~23% of customers have left the telecom company. We cannot use accuracy as our only metric because even if we predict all the customers to be continued, we will still end up getting ~77% accuracy. Our focus should be on churn rate i.e., on the minority class - 'inactive'
# Binary-encode the target: 1 = churned ('inactive'), 0 = retained ('active').
df_merged['status'] = df_merged['status'].map({'inactive':1,'active':0})
# Customers per county.
# NOTE(review): value_counts().reset_index() column naming changed in pandas
# 2.x (a 'count' column is produced automatically) -- verify the rename below
# against the installed pandas version.
region_cust = pd.DataFrame(df_merged.county.value_counts()).reset_index()
region_cust.rename(columns = {'county':'count','index':'county'},inplace = True)
region_cust
Jefferson (785) & Washington (600) have most of the customers, whereas the Green and Kinney regions have only 1 customer each.
# Ten most frequent counties (region_cust is sorted by count descending).
top_regions = region_cust['county'][0:10].tolist()
top_regions
# Subset of customers living in the top-10 counties.
region_df = df_merged[df_merged['county'].isin(top_regions)].reset_index(drop = True)
region_df.head()
region_df['county'].value_counts()
# top 10 county vs status(Churn)
# ``sns.factorplot`` was removed from seaborn; ``catplot`` is its replacement.
# factorplot's default kind was 'point', so pass kind='point' explicitly to
# keep the same chart type.
sns.catplot(x='county', y='status', data=region_df, kind='point', height=4, aspect=3)
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))
sns.countplot(x='county', data=region_df, ax=axis1)
sns.countplot(x='status', hue="county", data=region_df, order=[1,0], ax=axis2)
# group by Geography, and get the mean for Churned customers for each value in Geography
geography_perc = region_df[["county", "status"]].groupby(['county'],as_index=False).mean()
geography_perc.columns = ['county', 'Mean(status)']
sns.barplot(x='county', y='Mean(status)', data=geography_perc,
            order=['Jefferson',
                   'Washington',
                   'Montgomery',
                   'Jackson',
                   'Franklin',
                   'Harris',
                   'Monroe',
                   'Polk',
                   'Dallas',
                   'Erie'], ax=axis3)
del geography_perc
From the above graph we can observe that customers from Harris, Monroe & Polk have a slightly higher average churn rate (~0.25), whilst Jackson has the least average churn rate (0.18) amongst the top 10 frequent regions.
# Amount
# peaks for Exited/not exited customers by their amount
facet = sns.FacetGrid(df_merged, hue="status",aspect=4)
# ``shade=`` is deprecated in seaborn's kdeplot; ``fill=`` is the replacement.
facet.map(sns.kdeplot,'amount',fill= True)
facet.set(xlim=(0, df_merged['amount'].max()))
facet.add_legend()
# mean churn rate per distinct amount value
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
average_amount = df_merged[["amount", "status"]].groupby(['amount'],as_index=False).mean()
average_amount.columns = ['amount','Mean(status)']
sns.barplot(x='amount', y='Mean(status)', data=average_amount)
del average_amount
# average amount for active customers
avg_active = np.mean(df_merged['amount'][df_merged['status'] == 0].values)
print('Average amount for active customers: ',avg_active)
# average amount for inactive customers
avg_inactive = np.mean(df_merged['amount'][df_merged['status'] == 1].values)
print('Average amount for inactive customers: ',avg_inactive)
The difference is not that significant. Also the data is imbalanced, thus any conclusion made has to be done keeping the ratio of active vs inactive customers in mind.
# Most common plan per zipcode (mode; ties broken by the first mode value).
plan_zipcode = df_merged.groupby('zipcode')['plan'].apply(lambda x: x.mode().iloc[0]).reset_index()
plan_zipcode
# check - for zipcode = 88436 is 1_Gbps plan the most used?
a = df_merged[df_merged['zipcode'] == 88436].reset_index(drop = True)
a['plan'].value_counts()
Yes! For zipcode = 88436, the 1_Gbps plan is the most used (2 occurrences).
# Most common plan among churned vs retained customers.
plan_status = df_merged.groupby('status')['plan'].apply(lambda x: x.mode().iloc[0]).reset_index()
plan_status
Most customers who churned used the 200_Mbps plan.
# Check
# Verify the mode above by counting plans among churned customers only.
a = df_merged[df_merged['status'] == 1].reset_index(drop = True)
a['plan'].value_counts()
who is more loyal?
# Gender vs Churn
f,ax=plt.subplots(1,2,figsize=(18,10))
df_merged[['gender','status']].groupby(['gender']).mean().plot.bar(ax=ax[0])
ax[0].set_title('Churn vs Gender')
# seaborn >= 0.12 removed positional data variables; pass the column as x=.
sns.countplot(x='gender',hue='status',data=df_merged,ax=ax[1])
ax[1].set_title('Gender:Churned vs Retained')
print(df_merged[['gender','status']].groupby(['gender']).mean())
print()
print("# Female - Exited: ",len(df_merged[(df_merged['status'] == 1) & (df_merged['gender'] == 'F')]))
print("# Female - Retained: ",len(df_merged[(df_merged['status'] == 0) & (df_merged['gender'] == 'F')]))
print()
print("# Male - Exited: ",len(df_merged[(df_merged['status'] == 1) & (df_merged['gender'] == 'M')]))
print("# Male - Retained: ",len(df_merged[(df_merged['status'] == 0) & (df_merged['gender'] == 'M')]))
Both Male and Female have almost the same churn rate.
# Inspect the subscriber-type distribution (turns out to be a single value).
customer_info['subcriber_type'].value_counts()
subcriber_type is a static column that contains only one value, 'kinetic'. Let's drop the column.
# Drop the constant column -- it carries no predictive information.
df_merged.drop('subcriber_type',axis = 1,inplace = True)
df_merged.columns
# Most common journey description among churned vs retained customers.
desc_status = df_merged.groupby('status')['description'].apply(lambda x: x.mode().iloc[0]).reset_index()
desc_status
# Check
# Verify the mode above by counting descriptions among churned customers only.
a = df_merged[df_merged['status'] == 1].reset_index(drop = True)
a['description'].value_counts()
Disconnect & New_installation are the most common descriptions among the churned customers (797).
# daystoresolve
# Box plots of resolution time split by churn status.
y0 = df_merged.daystoresolve[df_merged.status == 0].values
y1 = df_merged.daystoresolve[df_merged.status == 1].values
fig = go.Figure()
fig.add_trace(go.Box(y=y0, name='Continued',
                     marker_color = 'blue'))
fig.add_trace(go.Box(y=y1, name = 'Exited',
                     marker_color = 'red'))
fig.update_layout(
    yaxis_title='Days to resolve'
)
fig.show()
# daystoresolve
# peaks for Exited/not exited customers by their daystoresolve
facet = sns.FacetGrid(df_merged, hue="status",aspect=4)
# ``shade=`` is deprecated in seaborn's kdeplot; ``fill=`` is the replacement.
facet.map(sns.kdeplot,'daystoresolve',fill= True)
facet.set(xlim=(0, df_merged['daystoresolve'].max()))
facet.add_legend()
# mean churn rate per distinct daystoresolve value
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
average_daystoresolve = df_merged[["daystoresolve", "status"]].groupby(['daystoresolve'],as_index=False).mean()
average_daystoresolve.columns = ['daystoresolve','Mean(status)']
sns.barplot(x='daystoresolve', y='Mean(status)', data=average_daystoresolve)
del average_daystoresolve
No real difference in days to resolve between churned and active customers
df_merged['dob'] = pd.to_datetime(df_merged['dob'])
# Approximate age by calendar-year difference only (ignores month/day, so it
# can be off by one year for customers whose birthday hasn't occurred yet).
df_merged['age'] = (pd.to_datetime('today').year - pd.to_datetime(df_merged['dob']).dt.year)
df_merged['age']
# age
# Box plots of age split by churn status.
y0 = df_merged.age[df_merged.status == 0].values
y1 = df_merged.age[df_merged.status == 1].values
fig = go.Figure()
fig.add_trace(go.Box(y=y0, name='Continued',
                     marker_color = 'blue'))
fig.add_trace(go.Box(y=y1, name = 'Exited',
                     marker_color = 'red'))
fig.update_layout(
    yaxis_title='Age'
)
fig.show()
No difference in age among active and inactive customers
#df_merged['doj'] = pd.to_datetime(df_merged['doj'])
# Tenure in whole years since date of joining (same year-only approximation as age).
df_merged['service'] = (pd.to_datetime('today').year - pd.to_datetime(df_merged['doj']).dt.year)
df_merged['service']
# Service
# peaks for Exited/not exited customers by their service
facet = sns.FacetGrid(df_merged, hue="status",aspect=4)
# ``shade=`` is deprecated in seaborn's kdeplot; ``fill=`` is the replacement.
facet.map(sns.kdeplot,'service',fill= True)
facet.set(xlim=(0, df_merged['service'].max()))
facet.add_legend()
# mean churn rate per tenure year
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
average_service = df_merged[["service", "status"]].groupby(['service'],as_index=False).mean()
average_service.columns = ['service','Mean(status)']
sns.barplot(x='service', y='Mean(status)', data=average_service)
del average_service
Customers with 1 year of usage from their date of joining have the highest mean churn rate, ~0.33.
# Log-transform amount to compress its long tail.
# NOTE(review): log10 of a zero or negative amount yields -inf/NaN -- confirm
# that amounts are strictly positive in this dataset.
df_merged['log_amount'] = np.log10(df_merged['amount'])
df_merged.tail()
# log Amount
# peaks for Exited/not exited customers by their log amount
facet = sns.FacetGrid(df_merged, hue="status",aspect=4)
# ``shade=`` is deprecated in seaborn's kdeplot; ``fill=`` is the replacement.
facet.map(sns.kdeplot,'log_amount',fill= True)
facet.set(xlim=(0, df_merged['log_amount'].max()))
facet.add_legend()
# mean churn rate per distinct log-amount value
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
average_logamount = df_merged[["log_amount", "status"]].groupby(['log_amount'],as_index=False).mean()
average_logamount.columns = ['log_amount','Mean(status)']
sns.barplot(x='log_amount', y='Mean(status)', data=average_logamount)
del average_logamount
# Drop identifiers, raw dates and redundant/derived columns before modelling.
drop_cols = ['name','custid','dob','doj','state_id','doorno','zipcode','phone','billing_date','last_billing_date','event_date',
             'last_event_date','log_amount','description']
df = df_merged.drop(drop_cols,axis = 1)
# Dimensions
print("Number of Rows: {} \nNumber of Columns: {}".format(df.shape[0],df.shape[1]))
df.tail()
# numeric_only=True preserves the old implicit behaviour: pandas >= 2.0 raises
# on non-numeric columns (gender, state, county, ...) instead of dropping them.
cordata = df.corr(method ='pearson', numeric_only=True)
cordata.style.background_gradient(cmap='summer')
There are no real highly correlated features.
We can see some independent features are correlated among themselves (amount & service, event_date_gap & service, event_date_gap & amount), but it has been stated that multi-collinearity is not an issue when using scikit-learn models.
## Convert column formats
# Stringify the event-type lists so they can be label-encoded later.
df['event_type'] = df['event_type'].apply(str)
#df['description'] = df['description'].apply(str)
Let's split the data into Train, CV and Test sets.
# dependent variable
y = df['status'].values
# independent variables
X = df.drop(['status'], axis=1)
X.head(2)
# train cv test split - stratified sampling
from sklearn.model_selection import train_test_split
# 64% train / 16% cv / 20% test, stratified to preserve the churn ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y,random_state = 17)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.20, stratify=y_train,random_state = 17)
# reset index
X_train = X_train.reset_index(drop = True)
X_cv = X_cv.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
# Save Train CV and test set
# NOTE(review): only the feature matrices are persisted; the y_* label arrays
# are not saved alongside them -- confirm that is intentional.
X_train.to_csv('Train_churn.csv',index = False)
X_test.to_csv('Test_churn.csv',index = False)
X_cv.to_csv('CV_churn.csv',index = False)
print("train dimensions: ",X_train.shape, y_train.shape)
print("cv dimensions: ",X_cv.shape, y_cv.shape)
print("test dimensions: ",X_test.shape, y_test.shape)
Preparing data for model building! Encoding categorical data and normalizing numerical data.
Each feature is of different scales/units. Features like amount have higher range of values compared to Age, Service. We need to standardise features before feeding them into our Models.
from sklearn.preprocessing import StandardScaler
# features to standardise
cols_norm = ['amount','daystoresolve','event_date_gap','age','service']
sc = StandardScaler()
sc.fit(X_train[cols_norm]) # fit has to happen only on train set
X_train[cols_norm] = sc.transform(X_train[cols_norm])
X_cv[cols_norm] = sc.transform(X_cv[cols_norm])
X_test[cols_norm] = sc.transform(X_test[cols_norm])
print("Standardized!")
X_train.head()
# Encoding Gender: 'M' -> 1, everything else (including 'F' and missing) -> 0.
X_train['gender'] = X_train['gender'].apply(lambda x: 1 if x == 'M' else 0)
X_cv['gender'] = X_cv['gender'].apply(lambda x: 1 if x == 'M' else 0)
X_test['gender'] = X_test['gender'].apply(lambda x: 1 if x == 'M' else 0)
# Label-encode the remaining categoricals. Each encoder is fit on the full
# pre-split frame ``df`` so categories absent from the train split can still
# be transformed; the fitted encoders are pickled later for serving.
# Encoding state
from sklearn.preprocessing import LabelEncoder
le_state = LabelEncoder()
le_state.fit(df['state'])
X_train['state'] = le_state.transform(X_train['state'])
X_cv['state'] = le_state.transform(X_cv['state'])
X_test['state'] = le_state.transform(X_test['state'])
# Encoding city
from sklearn.preprocessing import LabelEncoder
le_city = LabelEncoder()
le_city.fit(df['city'])
X_train['city'] = le_city.transform(X_train['city'])
X_cv['city'] = le_city.transform(X_cv['city'])
X_test['city'] = le_city.transform(X_test['city'])
# Encoding county
from sklearn.preprocessing import LabelEncoder
le_county = LabelEncoder()
le_county.fit(df['county'])
X_train['county'] = le_county.transform(X_train['county'])
X_cv['county'] = le_county.transform(X_cv['county'])
X_test['county'] = le_county.transform(X_test['county'])
# Encoding plan
from sklearn.preprocessing import LabelEncoder
le_plan = LabelEncoder()
le_plan.fit(df['plan'])
X_train['plan'] = le_plan.transform(X_train['plan'])
X_cv['plan'] = le_plan.transform(X_cv['plan'])
X_test['plan'] = le_plan.transform(X_test['plan'])
# Encoding event_type
from sklearn.preprocessing import LabelEncoder
le_event_type = LabelEncoder()
le_event_type.fit(df['event_type'])
X_train['event_type'] = le_event_type.transform(X_train['event_type'])
X_cv['event_type'] = le_event_type.transform(X_cv['event_type'])
X_test['event_type'] = le_event_type.transform(X_test['event_type'])
print('Encoded!')
X_train.head()
Assuming all the above X features will be available at the time of prediction (Live/Production), we will continue to build our model
def batch_predict(clf, data):
    """Return P(class=1) for every row of ``data``, predicting in chunks of 1000.

    roc_auc_score expects probability estimates of the positive class rather
    than hard predictions, hence ``predict_proba(...)[:, 1]``. Chunking keeps
    per-call memory bounded; the final slice automatically covers whatever
    remainder is left when the row count is not a multiple of 1000.
    """
    chunk_size = 1000
    probas = []
    for start in range(0, data.shape[0], chunk_size):
        chunk = data[start:start + chunk_size]
        probas.extend(clf.predict_proba(chunk)[:, 1])
    return probas
def uniform_random_sample(start, end, size=10):
    """
    Generate ``size`` unique integers drawn uniformly from [start, end),
    returned as an ascending list.

    Raises ValueError when the range cannot supply ``size`` unique values;
    the original rejection loop would spin forever in that case.
    """
    if size > end - start:
        raise ValueError("size exceeds the number of unique values in [start, end)")
    chosen = set()
    # Rejection-sample into a set: O(1) membership tests instead of the
    # original O(n) list scans.
    while len(chosen) < size:
        chosen.add(np.random.randint(start, end))
    return sorted(chosen)
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
train_auc = []
cv_auc = []
depths = [1, 5, 10, 50] # list of various max depth values we want to compare.
min_samples_splits = [5,10,100,500] # list of various min_samples_split values we want to compare.
# Random search: sample 10 of the 16 (depth, min_samples_split) combinations.
hyperparams = [(d,m) for d in depths for m in min_samples_splits]
hyperparameter_indices = uniform_random_sample(0,len(hyperparams))
params_list = [hyperparams[i] for i in hyperparameter_indices]
depths_ls = [i[0] for i in params_list]
min_samples_splits_ls = [i[1] for i in params_list]
for i in tqdm(params_list):
    # initialize DT Model with max_depth = i[0] and min_samples_split = i[1]
    dt = DecisionTreeClassifier(max_depth=i[0],min_samples_split=i[1],random_state = 20)
    dt.fit(X_train, y_train) # fit the decision-tree model on the train data
    y_train_pred = batch_predict(dt, X_train) # Predict on the train data
    y_cv_pred = batch_predict(dt, X_cv) # Predict on cross validation data
    # roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
    # not the predicted outputs
    train_auc.append(roc_auc_score(y_train,y_train_pred))
    cv_auc.append(roc_auc_score(y_cv, y_cv_pred))
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
# 3-D scatter of train/cv AUC over the sampled hyperparameter grid.
x1 = min_samples_splits_ls
y1 = depths_ls
z1 = train_auc
x2 = min_samples_splits_ls
y2 = depths_ls
z2 = cv_auc
# https://plot.ly/python/3d-axes/
trace1 = go.Scatter3d(x=x1,y=y1,z=z1, name = 'train auc')
trace2 = go.Scatter3d(x=x2,y=y2,z=z2, name = 'cv auc')
data = [trace1, trace2]
layout = go.Layout(scene = dict(
    xaxis = dict(title='min_samples_split'),
    yaxis = dict(title='max_depth'),
    zaxis = dict(title='AUC'),))
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, filename='3d-scatter-colorscale')
# best hyperparameters
# NOTE(review): chosen manually from the CV surface above, not programmatically.
best_depth = 50
best_min_samples_split = 100
best_dt = DecisionTreeClassifier(max_depth = best_depth, min_samples_split=best_min_samples_split,random_state = 20)
best_dt.fit(X_train,y_train)
from sklearn.metrics import roc_curve, auc
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = batch_predict(best_dt, X_train)
y_test_pred = batch_predict(best_dt, X_test)
# ROC curves for the tuned decision tree on train and held-out test data.
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)
plt.plot(train_fpr, train_tpr, label="Train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="Test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC AUC Curve")
plt.grid()
plt.show()
print('Decision Tree --> Train AUC: ',auc(train_fpr, train_tpr))
print('Decision Tree --> Test AUC: ',auc(test_fpr, test_tpr))
The difference between the train and the test auc scores is very low and both the auc scores are high. Thus the model performed well.
# we will pick a threshold that will give the least fpr
import numpy as np
def find_best_threshold(threshold, fpr, tpr):
    """Return the threshold maximising tpr*(1-fpr).

    The product is largest where TPR is high while FPR stays low, so the
    selected threshold favours catching positives without many false alarms.
    """
    scores = tpr * (1 - fpr)
    best = threshold[np.argmax(scores)]
    print("The maximum value of tpr*(1-fpr)", max(scores), "for threshold", np.round(best,3))
    return best
def predict_with_best_t(proba, threshold):
    """Binarize probability scores: 1 where score >= threshold, else 0."""
    return [1 if score >= threshold else 0 for score in proba]
from sklearn.metrics import confusion_matrix
# Threshold chosen on TRAIN ROC, then applied to the test predictions.
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
cm_te = confusion_matrix(y_test, predict_with_best_t(y_test_pred, best_t))
import seaborn as sns
# Reference: https://seaborn.pydata.org/generated/seaborn.heatmap.html
# Confusion matrix on test data
print()
print("Confusion matrix: Test data")
ax= plt.subplot()
sns.heatmap(cm_te,annot=True,fmt='1g',cbar = False,cmap = 'YlOrRd')
ax.set_xlabel('Predicted Class Labels');
ax.set_ylabel('Actual Class Labels');
ax.set_title('Confusion-Matrix: Test');
plt.show()
## Classification report
from sklearn.metrics import classification_report
cr = classification_report(y_test,predict_with_best_t(y_test_pred, best_t))
print(cr)
Our precision for Positive class (1 --> Churned/Exited) is ~65% which has to be improved. However f1-score is reasonable for the positive class.
import xgboost as xgb
train_auc = []
cv_auc = []
n_trees = [50,100,200,300,500,750,1000] # list of various n_estimator values we want to compare.
l_rate = [0.0001,0.001,0.01,0.1,1.0,10] # list of various learning rate values we want to compare.
# Random search: sample 15 of the 42 (n_estimators, learning_rate) combinations.
hyperparams = [(n,l) for n in n_trees for l in l_rate]
hyperparameter_indices = uniform_random_sample(0,len(hyperparams),size = 15)
params_list = [hyperparams[i] for i in hyperparameter_indices]
n_trees_ls = [i[0] for i in params_list]
l_rate_ls = [i[1] for i in params_list]
for i in tqdm(params_list):
    # initialize XGBoost Model with n_estimators = i[0] and learning_rate = i[1]
    xg = xgb.XGBClassifier(n_estimators=i[0],learning_rate=i[1],random_state = 17)
    xg.fit(X_train, y_train) # fit the Xgboost model on the train data
    y_train_pred = batch_predict(xg, X_train) # Predict on the train data
    y_cv_pred = batch_predict(xg, X_cv) # Predict on cross validation data
    # roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
    # not the predicted outputs
    train_auc.append(roc_auc_score(y_train,y_train_pred))
    cv_auc.append(roc_auc_score(y_cv, y_cv_pred))
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
# 3-D scatter of train/cv AUC over the sampled hyperparameter grid.
x1 = l_rate_ls
y1 = n_trees_ls
z1 = train_auc
x2 = l_rate_ls
y2 = n_trees_ls
z2 = cv_auc
# https://plot.ly/python/3d-axes/
trace1 = go.Scatter3d(x=x1,y=y1,z=z1, name = 'train auc')
trace2 = go.Scatter3d(x=x2,y=y2,z=z2, name = 'cv auc')
data = [trace1, trace2]
layout = go.Layout(scene = dict(
    xaxis = dict(title='Learning Rate'),
    yaxis = dict(title='n_estimators'),
    zaxis = dict(title='AUC'),))
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, filename='3d-scatter-colorscale')
# best hyperparameters
# NOTE(review): chosen manually from the CV surface above, not programmatically.
best_n_estimator = 750
best_l_rate = 0.1
best_xgb = xgb.XGBClassifier(n_estimators=best_n_estimator,learning_rate=best_l_rate,random_state = 17)
best_xgb.fit(X_train,y_train)
from sklearn.metrics import roc_curve, auc
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = batch_predict(best_xgb, X_train)
y_test_pred = batch_predict(best_xgb, X_test)
# ROC curves for the tuned XGBoost model on train and held-out test data.
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)
plt.plot(train_fpr, train_tpr, label="Train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="Test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC AUC Curve")
plt.grid()
plt.show()
print('XGBoost --> Train AUC: ',auc(train_fpr, train_tpr))
print('XGBoost --> Test AUC: ',auc(test_fpr, test_tpr))
# Threshold chosen on TRAIN ROC, then applied to the test predictions.
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
cm_te = confusion_matrix(y_test, predict_with_best_t(y_test_pred, best_t))
import seaborn as sns
# Reference: https://seaborn.pydata.org/generated/seaborn.heatmap.html
# Confusion matrix on test data
print()
print("Confusion matrix: Test data")
ax= plt.subplot()
sns.heatmap(cm_te,annot=True,fmt='1g',cbar = False,cmap = 'YlOrRd')
ax.set_xlabel('Predicted Class Labels');
ax.set_ylabel('Actual Class Labels');
ax.set_title('Confusion-Matrix: Test');
plt.show()
## Classification report -- XGBoost Model
from sklearn.metrics import classification_report
cr = classification_report(y_test,predict_with_best_t(y_test_pred, best_t))
print('XGBoost Model:\n',cr)
XGBoost have performed well. F1-score, precision and recall are all good.
# feature importance
# Gain-based importances from the fitted XGBoost model, one row per feature.
fe_imp = pd.DataFrame({'Feature':X_train.columns.tolist(),'Importance':best_xgb.feature_importances_})
print(fe_imp.sort_values(by=['Importance'], ascending=False))
import plotly.express as px
fig = px.bar(fe_imp, x='Feature', y='Importance')
fig.show()
from prettytable import PrettyTable
# Summary table of both tuned models and their test AUC.
pt = PrettyTable(['Model','Hyper parameter','Auc on testset'])
pt.add_row(['Decision Tree','best_depth = 50, best_min_samples_split = 100',0.94])
pt.add_row(['XGBoost','n_estimator = 750, learning_rate = 0.1',0.97])
print(pt)
Assuming that all the features used during model building will be available for the test set (during Live/Production), the models have performed a good job in predicting the potential churn of the customer.
Service, event_date_gap & event_type were the top three most important features (XGBOOST Model).
import pickle
# Save model
# Persist the fitted scaler, the five label encoders and the XGBoost model
# into one pickle stream; any loader must read them back in this exact order.
with open("xgbfiles.pickle","wb") as f:
    pickle.dump(sc, f)
    pickle.dump(le_state, f)
    pickle.dump(le_city, f)
    pickle.dump(le_county, f)
    pickle.dump(le_plan, f)
    pickle.dump(le_event_type, f)
    pickle.dump(best_xgb, f)
print("Files Saved!")
"""
# we should read them in the same order as we dumped.
with open("xgbfiles.pickle", "rb") as f:
sc = pickle.load(f)
le_state = pickle.load(f)
le_city = pickle.load(f)
le_county = pickle.load(f)
le_plan = pickle.load(f)
le_event_type = pickle.load(f)
best_xgb = pickle.load(f)
"""